{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 9.4 GBDT案例实战 - 产品定价模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 9.4.2 模型搭建"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1.读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>页数</th>\n",
       "      <th>类别</th>\n",
       "      <th>彩印</th>\n",
       "      <th>纸张</th>\n",
       "      <th>价格</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>207</td>\n",
       "      <td>技术类</td>\n",
       "      <td>0</td>\n",
       "      <td>双胶纸</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>210</td>\n",
       "      <td>技术类</td>\n",
       "      <td>0</td>\n",
       "      <td>双胶纸</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>206</td>\n",
       "      <td>技术类</td>\n",
       "      <td>0</td>\n",
       "      <td>双胶纸</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>218</td>\n",
       "      <td>技术类</td>\n",
       "      <td>0</td>\n",
       "      <td>双胶纸</td>\n",
       "      <td>64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>209</td>\n",
       "      <td>技术类</td>\n",
       "      <td>0</td>\n",
       "      <td>双胶纸</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    页数   类别  彩印   纸张  价格\n",
       "0  207  技术类   0  双胶纸  60\n",
       "1  210  技术类   0  双胶纸  62\n",
       "2  206  技术类   0  双胶纸  62\n",
       "3  218  技术类   0  双胶纸  64\n",
       "4  209  技术类   0  双胶纸  60"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel('产品定价模型.xlsx')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "查看各个分类的数据量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "技术类    336\n",
       "教辅类    333\n",
       "办公类    331\n",
       "Name: 类别, dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['类别'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    648\n",
       "1    352\n",
       "Name: 彩印, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['彩印'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "双胶纸    615\n",
       "铜版纸    196\n",
       "书写纸    189\n",
       "Name: 纸张, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['纸张'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2.分类型文本变量处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将文本内容转为数值，关于LabelEncoder()函数也在将在11.1.2节进行进一步讲解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "le = LabelEncoder()\n",
    "df['类别'] = le.fit_transform(df['类别'])  # 处理类别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    336\n",
       "2    333\n",
       "0    331\n",
       "Name: 类别, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将类别一列处理后，我们可以使用value_counts()方法查看转化效果：\n",
    "df['类别'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 另外一种文本内容转为数值的方法，注意不要再运行完上面的代码后运行，因为上面的内容已经被替代完毕了，如果想尝试，需要重新运行，并且，先运行下面的代码\n",
    "# df['类别'] = df['类别'].replace({'办公类': 0, '技术类': 1, '教辅类': 2})  \n",
    "# df['类别'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下面我们使用同样的方法处理“纸张”一列：\n",
    "le = LabelEncoder()\n",
    "df['纸张'] = le.fit_transform(df['纸张'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>页数</th>\n",
       "      <th>类别</th>\n",
       "      <th>彩印</th>\n",
       "      <th>纸张</th>\n",
       "      <th>价格</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>207</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>210</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>206</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>218</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>209</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    页数  类别  彩印  纸张  价格\n",
       "0  207   1   0   1  60\n",
       "1  210   1   0   1  62\n",
       "2  206   1   0   1  62\n",
       "3  218   1   0   1  64\n",
       "4  209   1   0   1  60"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 此时的表格如下：\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3.提取特征变量和目标变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop(columns='价格') \n",
    "y = df['价格']  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4.划分训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "5.模型训练及搭建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "                          learning_rate=0.1, loss='ls', max_depth=3,\n",
       "                          max_features=None, max_leaf_nodes=None,\n",
       "                          min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                          min_samples_leaf=1, min_samples_split=2,\n",
       "                          min_weight_fraction_leaf=0.0, n_estimators=100,\n",
       "                          n_iter_no_change=None, presort='auto',\n",
       "                          random_state=123, subsample=1.0, tol=0.0001,\n",
       "                          validation_fraction=0.1, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "model = GradientBoostingRegressor(random_state=123)\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "9.4.3 模型预测及评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 71.15004038  79.56199921  68.21751792  90.78788507  78.88479128\n",
      "  42.28022702  39.27334177  60.74670841  53.59744659  77.65931771\n",
      "  80.22295545  76.04437155  79.56199921  58.40372895  79.65245266\n",
      "  44.27997693  53.18177447  35.31452467  92.1798291   58.40372895\n",
      "  41.96644278  99.50466356  80.22295545  79.69648341  91.45061741\n",
      "  42.93885741  42.86973046  75.71824996  48.55203652  62.94185778\n",
      "  39.47077874  61.54190648  95.18389309  51.88118394  65.1293139\n",
      "  50.17577837  39.54495179  83.63542315  56.24632221 102.1176112\n",
      "  48.89080247  49.23639342  33.03502962  52.74862135  35.47220867\n",
      "  35.00370671  53.9446399   74.62364353  35.31452467  53.9446399 ]\n"
     ]
    }
   ],
   "source": [
    "# 模型搭建完毕后，通过如下代码预测测试集数据：\n",
    "y_pred = model.predict(X_test)\n",
    "print(y_pred[0:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>预测值</th>\n",
       "      <th>实际值</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>71.150040</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>79.561999</td>\n",
       "      <td>84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>68.217518</td>\n",
       "      <td>68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>90.787885</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>78.884791</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         预测值  实际值\n",
       "0  71.150040   75\n",
       "1  79.561999   84\n",
       "2  68.217518   68\n",
       "3  90.787885   90\n",
       "4  78.884791   85"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过和之前章节类似的代码，我们可以将预测值和实际值进行对比：\n",
    "a = pd.DataFrame()  # 创建一个空DataFrame \n",
    "a['预测值'] = list(y_pred)\n",
    "a['实际值'] = list(y_test)\n",
    "a.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8741691363311168"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看预测评分 - 方法1：自带的score函数，本质就是R-squared值（也即统计学中常说的R^2)\n",
    "model.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8741691363311168\n"
     ]
    }
   ],
   "source": [
    "# 查看预测评分 - 方法2：r2_score()函数\n",
    "from sklearn.metrics import r2_score\n",
    "r2 = r2_score(y_test, model.predict(X_test))\n",
    "print(r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.49070203, 0.44718694, 0.04161545, 0.02049558])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看特征重要性\n",
    "model.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>特征名称</th>\n",
       "      <th>特征重要性</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>页数</td>\n",
       "      <td>0.490702</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>类别</td>\n",
       "      <td>0.447187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>彩印</td>\n",
       "      <td>0.041615</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>纸张</td>\n",
       "      <td>0.020496</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  特征名称     特征重要性\n",
       "0   页数  0.490702\n",
       "1   类别  0.447187\n",
       "2   彩印  0.041615\n",
       "3   纸张  0.020496"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 通过DataFrame的方式展示特征重要性\n",
    "features = X.columns  # 获取特征名称\n",
    "importances = model.feature_importances_  # 获取特征重要性\n",
    "\n",
    "# 通过二维表格形式显示\n",
    "importances_df = pd.DataFrame()\n",
    "importances_df['特征名称'] = features\n",
    "importances_df['特征重要性'] = importances\n",
    "importances_df.sort_values('特征重要性', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "9.2.4 模型参数(选学)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 分类模型，通过如下代码可以查看官方介绍\n",
    "# from sklearn.ensemble import AdaBoostClassifier\n",
    "# AdaBoostClassifier?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 回归模型，通过如下代码可以查看官方介绍\n",
    "# from sklearn.ensemble import AdaBoostRegressor\n",
    "# AdaBoostRegressor?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
